knitr::opts_chunk$set(
  warning = FALSE, message = FALSE,
  fig.width = 10, fig.height = 8
)
library(tidyverse)
library(magrittr)
library(limma)
library(edgeR)
library(ggrepel)
library(ggplot2)
library(pheatmap)
library(AnnotationHub)
library(ensembldb)
# library(plotly)
# library(RColorBrewer)
# library(clusterProfiler)
# library(org.Mm.eg.db)
# library(KEGGREST)
# library(fgsea)
# library(data.table)
library(scales)
library(broom)
library(glue)
library(pander)
theme_set(
  theme_bw() +
    theme(plot.title = element_text(hjust = 0.5))
)

Data Preparation & Inspection

counts <- here::here("data", "genes.out.gz") %>% 
  read_tsv() %>% 
  rename_all(basename) %>% 
  rename_all(str_remove_all, pattern = "Aligned.sortedByCoord.out.bam")
samples <- tibble(
  sample = setdiff(colnames(counts), "Geneid"),
  condition = gsub("M2_|M3_|M5_|M6_", "", sample) %>% 
    factor(levels = c("DN", "DP", "LAG_3", "CD49b")),
  mouse = str_extract(sample, "M[0-9]"),
  label = glue("{condition} ({mouse})")
)
dgeList <- counts %>%
  as.data.frame() %>%
  column_to_rownames("Geneid") %>%
  as.matrix() %>%
  DGEList(
    samples = samples
  ) %>% 
  calcNormFactors(method = "TMM")
ah <- AnnotationHub()
#Find which genome did we use
# unique(ah$dataprovider)
# subset(ah, dataprovider == "Ensembl") %>%  #In Ensembl databse
#   subset(species == "Mus musculus") %>%  #under Mouse
#   subset(rdataclass == "EnsDb") 
ensDb <- ah[["AH69210"]] #This is the genome used for assigning reads to genes
genes <- genes(ensDb) %>% #extract the genes
  subset(seqnames %in% c(1:19, "MT", "X", "Y")) %>%
  keepStandardChromosomes() %>% 
  sortSeqlevels()
dgeList$genes <- data.frame(gene_id = rownames(dgeList)) %>% 
  left_join(
    mcols(genes) %>% 
      as.data.frame() %>% 
      dplyr::select(gene_id, gene_name, gene_biotype, description, entrezid),
    by = "gene_id"
  )
dgeList$samples %>%
  mutate(CellType = dgeList$samples$condition) %>%
  ggplot(aes(x = label, y = lib.size / 1e6, fill = CellType)) +
  geom_col() +
  geom_hline(
    yintercept = mean(dgeList$samples$lib.size / 1e6),
    linetype = 2, colour = "grey30"
  ) +
  labs(x = "Sample", y = "Library Size (millions)") +
  facet_wrap(~mouse, scales = "free_x", nrow = 1) +
  scale_x_discrete(labels = label_wrap(5)) +
  scale_y_continuous(expand = expansion(c(0, 0.05))) 
**Figure S7a** *Library sizes for all libraries after summarisation to gene-level counts. The mean library size across all libraries is shown as a dashed horizontal line.*

Figure S7a Library sizes for all libraries after summarisation to gene-level counts. The mean library size across all libraries is shown as a dashed horizontal line.

genes2keep <- dgeList %>% 
  cpm(log = TRUE) %>% 
  is_greater_than(1) %>% 
  rowSums() %>% 
  is_greater_than(4)
dgeFilt <- dgeList[genes2keep,, keep.lib.sizes = FALSE]

Genes were only retained in the final dataset if 4 or more samples returned \(>1\) log2 Counts per Million (logCPM). The gave a dataset of 11,117 of the initial 55,450 genes which were retained for downstream analysis.

dgeFilt %>% 
  cpm(log = TRUE) %>% 
  as_tibble(rownames = "gene_id") %>% 
  pivot_longer(cols = all_of(colnames(dgeFilt)), names_to = "sample", values_to = "logCPM") %>% 
  left_join(dgeFilt$samples, by = "sample") %>% 
  ggplot(aes(logCPM, y = stat(density), colour = condition, group = sample)) +
  geom_density() +
  facet_wrap(~mouse) +
  scale_y_continuous(expand = expansion(c(0.01, 0.05)))
*logCPM densisties after removal of undetectable genes. The double negative (DN) samples for both m% and M6 appear to skew to lower overall counts compared to all other samples, with a handful of highly expressed genes likely to dominate the sample.*

logCPM densisties after removal of undetectable genes. The double negative (DN) samples for both m% and M6 appear to skew to lower overall counts compared to all other samples, with a handful of highly expressed genes likely to dominate the sample.

pca <- dgeFilt %>% 
  cpm(log = TRUE) %>% 
  t() %>% 
  prcomp()
pca %>% 
  broom::tidy() %>% 
  dplyr::rename(sample = row) %>% 
  dplyr::filter(PC %in% c(1, 2)) %>% 
  pivot_wider(names_from = "PC", names_prefix = "PC", values_from = "value") %>% 
  left_join(dgeFilt$samples, by = "sample") %>% 
  ggplot(aes(PC1, PC2, colour = condition)) +
  geom_point(size = 3) +
  geom_text_repel(
    aes(label = label),#str_replace_all(label, " ", "\n")),
    max.overlaps = Inf
  ) +
  labs(
    x = glue("PC1 ({percent(summary(pca)$importance[2, 'PC1'])})"),
    y = glue("PC2 ({percent(summary(pca)$importance[2, 'PC2'])})"),
    colour = "Cell Type"
  )
**Figure S7b** *PCA on logCPM values, with the two DN samples identified above clearly showing strong divergence from the remainder of the dataset. The CD49b sample fro M2 also appeared slightly divergent, with the previous density plot also showing a slght skew towards lower overall counts.*

Figure S7b PCA on logCPM values, with the two DN samples identified above clearly showing strong divergence from the remainder of the dataset. The CD49b sample fro M2 also appeared slightly divergent, with the previous density plot also showing a slght skew towards lower overall counts.

The above PCA revealed some potential problems with two of the four DN samples. Exclusion of the clear outliers will reduce the number of viable samples within the DN group to 2 and as an alternative, a weighting strategy was instead sought for all samples.

Model Fitting

U <- matrix(1, nrow = ncol(dgeFilt)) %>% 
  set_colnames("Intercept")
v <- voomWithQualityWeights(dgeFilt, design = U)
X <- model.matrix(~0 + condition, data = dgeFilt$samples) %>% 
  set_colnames(str_remove(colnames(.), "condition"))
rho <- duplicateCorrelation(v, X, block=dgeFilt$samples$mouse)$consensus.correlation
v <- voomWithQualityWeights(
  counts = dgeFilt, design = U, block=dgeFilt$samples$mouse, correlation=rho
)

Sample-level weights were estimated by assuming all samples were drawn from the same group and running voomWithQualityWeights(). After running this, samples were compared within each mouse-of-origin and correlations within mice were estimated using duplicateCorrelation() (\(\rho=\) 0.118). voomWithQualityWeights() was then run again setting mouse as the blocking variable and including the consensus correlations

v$targets %>% 
  ggplot(aes(label, sample.weights, fill = condition)) +
  geom_col() +
  geom_hline(yintercept = 1, linetype = 2, col = "grey30") +
  facet_wrap(~mouse, nrow = 1, scales = "free_x") +
  scale_x_discrete(labels = scales::label_wrap(5)) +
  scale_y_continuous(expand = expansion(c(0, 0.05))) +
  labs(
    x = "Sample", y = "Sample Weights", fill = "Cell Type"
  )
*Sample-level weights after running `voomWithQualityWeights` setting all samples as being drawn from the same condition. The ideal equal wweighting of 1 is shown as the dashed horizontal line, with those samples below this being assumed to be of lower quality than those above the line. Thw two previously identified DN samples were strongly down-weighted, as was the CD49b sample from M2*

Sample-level weights after running voomWithQualityWeights setting all samples as being drawn from the same condition. The ideal equal wweighting of 1 is shown as the dashed horizontal line, with those samples below this being assumed to be of lower quality than those above the line. Thw two previously identified DN samples were strongly down-weighted, as was the CD49b sample from M2

cont.matrix <- makeContrasts(
  DNvLAG3 = DN - LAG_3,
  DNvDP = DN - DP,
  DNvCD49b = DN - CD49b,
  LAG3vCD49b = LAG_3 - CD49b,
  CD49bvDP = CD49b - DP,
  LAG3vDP = LAG_3 - DP,
  levels = X
)
fit <- lmFit(v, design = X, block = v$targets$mouse, correlation = rho) %>% 
  contrasts.fit(cont.matrix) %>%
  # treat()
  eBayes()

Summary Table

top_tables <- colnames(cont.matrix) %>% 
  lapply(function(x) topTable(fit, coef = x, number = Inf)) %>%
  # lapply(function(x) topTreat(fit, coef = x, number = Inf)) %>%
  lapply(as_tibble) %>%
  lapply(mutate, DE = adj.P.Val < 0.05 & abs(logFC) > 1) %>% 
  setNames(colnames(cont.matrix))
top_tables %>% 
  lapply(
    function(x){
      df <- dplyr::filter(x,DE)
      tibble(
        Up = sum(df$logFC > 0),
        Down = sum(df$logFC < 0),
        `Total DE` = Up + Down
      )
    }
  ) %>% 
  bind_rows(.id = "Comparison") %>% 
  pander(
    justify = "lrrr",
    caption = "Results from each comparison, where genes are considered DE using an FDR < 0.05 along with an estimated logFC beyond $\\pm1$."
  )
Results from each comparison, where genes are considered DE using an FDR < 0.05 along with an estimated logFC beyond \(\pm1\).
Comparison Up Down Total DE
DNvLAG3 46 31 77
DNvDP 27 34 61
DNvCD49b 16 2 18
LAG3vCD49b 111 56 167
CD49bvDP 6 32 38
LAG3vDP 16 17 33

MA Plots

DN Vs. LAG3

top_tables$DNvLAG3 %>% 
  ggplot(aes(AveExpr, logFC)) +
  geom_point(aes(colour = DE), alpha = 0.6) +
  geom_smooth(se = FALSE) +
  geom_label_repel(
    aes(label = gene_name, colour = DE),
    data = . %>% 
      dplyr::filter(DE, logFC > 0) %>% 
      arrange(desc(logFC)) %>% 
      dplyr::slice(1:5),
    max.overlaps = Inf,
    show.legend = FALSE
  ) +
  geom_label_repel(
    aes(label = gene_name, colour = DE),
    data = . %>% 
      dplyr::filter(DE, logFC < 0) %>% 
      arrange(logFC) %>% 
      dplyr::slice(1:5),
    max.overlaps = Inf,
    show.legend = FALSE
  ) +
  ggtitle("DN Vs. LAG3") +
  scale_colour_manual(values = c("grey30", "red"))
MA-Plot for DN Vs. LAG3. The 5 most up/down-regulated genes are labelled, with the blue line representing a spline through the data.

MA-Plot for DN Vs. LAG3. The 5 most up/down-regulated genes are labelled, with the blue line representing a spline through the data.

DN Vs. DP

top_tables$DNvDP %>% 
  ggplot(aes(AveExpr, logFC)) +
  geom_point(aes(colour = DE), alpha = 0.6) +
  geom_smooth(se = FALSE) +
  geom_label_repel(
    aes(label = gene_name, colour = DE),
    data = . %>% 
      dplyr::filter(DE, logFC > 0) %>% 
      arrange(desc(logFC)) %>% 
      dplyr::slice(1:5),
    max.overlaps = Inf,
    show.legend = FALSE
  ) +
  geom_label_repel(
    aes(label = gene_name, colour = DE),
    data = . %>% 
      dplyr::filter(DE, logFC < 0) %>% 
      arrange(logFC) %>% 
      dplyr::slice(1:5),
    max.overlaps = Inf,
    show.legend = FALSE
  ) +
  ggtitle("DN Vs. DP") +
  scale_colour_manual(values = c("grey30", "red"))
MA-Plot for DN Vs. DP. The 5 most up/down-regulated genes are labelled, with the blue line representing a spline through the data.

MA-Plot for DN Vs. DP. The 5 most up/down-regulated genes are labelled, with the blue line representing a spline through the data.

DN Vs. CD49b

top_tables$DNvCD49b %>% 
  ggplot(aes(AveExpr, logFC)) +
  geom_point(aes(colour = DE), alpha = 0.6) +
  geom_smooth(se = FALSE) +
  geom_label_repel(
    aes(label = gene_name, colour = DE),
    data = . %>% 
      dplyr::filter(DE, logFC > 0) %>% 
      arrange(desc(logFC)) %>% 
      dplyr::slice(1:5),
    max.overlaps = Inf,
    show.legend = FALSE
  ) +
  geom_label_repel(
    aes(label = gene_name, colour = DE),
    data = . %>% 
      dplyr::filter(DE, logFC < 0) %>% 
      arrange(logFC) %>% 
      dplyr::slice(1:5),
    max.overlaps = Inf,
    show.legend = FALSE
  ) +
  ggtitle("DN Vs. CD49b") +
  scale_colour_manual(values = c("grey30", "red"))
MA-Plot for DN Vs. CD49b The most up/down-regulated genes are labelled, with the blue line representing a spline through the data.

MA-Plot for DN Vs. CD49b The most up/down-regulated genes are labelled, with the blue line representing a spline through the data.

LAG3 Vs. CD49b

top_tables$LAG3vCD49b %>% 
  ggplot(aes(AveExpr, logFC)) +
  geom_point(aes(colour = DE), alpha = 0.6) +
  geom_smooth(se = FALSE) +
  geom_label_repel(
    aes(label = gene_name, colour = DE),
    data = . %>% 
      dplyr::filter(DE, logFC > 0) %>% 
      arrange(desc(logFC)) %>% 
      dplyr::slice(1:5),
    max.overlaps = Inf,
    show.legend = FALSE
  ) +
  geom_label_repel(
    aes(label = gene_name, colour = DE),
    data = . %>% 
      dplyr::filter(DE, logFC < 0) %>% 
      arrange(logFC) %>% 
      dplyr::slice(1:5),
    max.overlaps = Inf,
    show.legend = FALSE
  ) +
  ggtitle("LAG3 Vs. CD49b") +
  scale_colour_manual(values = c("grey30", "red"))
MA-Plot for LAG3 Vs. CD49b. The 5 most up/down-regulated genes are labelled, with the blue line representing a spline through the data.

MA-Plot for LAG3 Vs. CD49b. The 5 most up/down-regulated genes are labelled, with the blue line representing a spline through the data.

LAG3 Vs. DP

top_tables$LAG3vDP %>% 
  ggplot(aes(AveExpr, logFC)) +
  geom_point(aes(colour = DE), alpha = 0.6) +
  geom_smooth(se = FALSE) +
  geom_label_repel(
    aes(label = gene_name, colour = DE),
    data = . %>% 
      dplyr::filter(DE, logFC > 0) %>% 
      arrange(desc(logFC)) %>% 
      dplyr::slice(1:5),
    max.overlaps = Inf,
    show.legend = FALSE
  ) +
  geom_label_repel(
    aes(label = gene_name, colour = DE),
    data = . %>% 
      dplyr::filter(DE, logFC < 0) %>% 
      arrange(logFC) %>% 
      dplyr::slice(1:5),
    max.overlaps = Inf,
    show.legend = FALSE
  ) +
  ggtitle("LAG3 Vs. DP") +
  scale_colour_manual(values = c("grey30", "red"))
MA-Plot for LAG3 Vs. DP The 5 most up/down-regulated genes are labelled, with the blue line representing a spline through the data.

MA-Plot for LAG3 Vs. DP The 5 most up/down-regulated genes are labelled, with the blue line representing a spline through the data.

CD49b Vs. DP

top_tables$CD49bvDP %>% 
  ggplot(aes(AveExpr, logFC)) +
  geom_point(aes(colour = DE), alpha = 0.6) +
  geom_smooth(se = FALSE) +
  geom_label_repel(
    aes(label = gene_name, colour = DE),
    data = . %>% 
      dplyr::filter(DE, logFC > 0) %>% 
      arrange(desc(logFC)) %>% 
      dplyr::slice(1:5),
    max.overlaps = Inf,
    show.legend = FALSE
  ) +
  geom_label_repel(
    aes(label = gene_name, colour = DE),
    data = . %>% 
      dplyr::filter(DE, logFC < 0) %>% 
      arrange(logFC) %>% 
      dplyr::slice(1:5),
    max.overlaps = Inf,
    show.legend = FALSE
  ) +
  ggtitle("CD49b Vs. DP") +
  scale_colour_manual(values = c("grey30", "red"))
MA-Plot for CD49b Vs. DP. The 5 most up/down-regulated genes are labelled, with the blue line representing a spline through the data.

MA-Plot for CD49b Vs. DP. The 5 most up/down-regulated genes are labelled, with the blue line representing a spline through the data.

Volcano Plots

DN Vs. LAG3

top_tables$DNvLAG3 %>% 
  ggplot(aes(logFC, -log10(P.Value))) +
  geom_point(aes(colour = DE), alpha = 0.6) +
  geom_label_repel(
    aes(label = gene_name, colour = DE),
    data = . %>% 
      dplyr::filter(DE) %>% 
      arrange(P.Value) %>% 
      dplyr::slice(1:10),
    max.overlaps = Inf,
    show.legend = FALSE
  ) +
  ggtitle("DN Vs. LAG3") +
  scale_colour_manual(values = c("grey30", "red"))
Volcano Plot for DN Vs. LAG3. The 10 most highly-ranked genes are labelled, with the blue line representing a spline through the data.

Volcano Plot for DN Vs. LAG3. The 10 most highly-ranked genes are labelled, with the blue line representing a spline through the data.

DN Vs. DP

top_tables$DNvDP %>% 
  ggplot(aes(logFC, -log10(P.Value))) +
  geom_point(aes(colour = DE), alpha = 0.6) +
  geom_label_repel(
    aes(label = gene_name, colour = DE),
    data = . %>% 
      dplyr::filter(DE) %>% 
      arrange(P.Value) %>% 
      dplyr::slice(1:10),
    max.overlaps = Inf,
    show.legend = FALSE
  ) +
  ggtitle("DN Vs. DP") +
  scale_colour_manual(values = c("grey30", "red"))
Volcano Plot for DN Vs. DP. The 10 most highly-ranked genes are labelled, with the blue line representing a spline through the data.

Volcano Plot for DN Vs. DP. The 10 most highly-ranked genes are labelled, with the blue line representing a spline through the data.

DN Vs. CD49b

top_tables$DNvCD49b %>% 
  ggplot(aes(logFC, -log10(P.Value))) +
  geom_point(aes(colour = DE), alpha = 0.6) +
  geom_label_repel(
    aes(label = gene_name, colour = DE),
    data = . %>% 
      dplyr::filter(DE) %>% 
      arrange(P.Value) %>% 
      dplyr::slice(1:10),
    max.overlaps = Inf,
    show.legend = FALSE
  ) +
  ggtitle("DN Vs. CD49b") +
  scale_colour_manual(values = c("grey30", "red"))
Volcano Plot for DN Vs. CD49b The most up/down-regulated genes are labelled, with the blue line representing a spline through the data.

Volcano Plot for DN Vs. CD49b The most up/down-regulated genes are labelled, with the blue line representing a spline through the data.

LAG3 Vs. CD49b

top_tables$LAG3vCD49b %>% 
  ggplot(aes(logFC, -log10(P.Value))) +
  geom_point(aes(colour = DE), alpha = 0.6) +
  geom_label_repel(
    aes(label = gene_name, colour = DE),
    data = . %>% 
      dplyr::filter(DE) %>% 
      arrange(P.Value) %>% 
      dplyr::slice(1:10),
    max.overlaps = Inf,
    show.legend = FALSE
  ) +
  ggtitle("LAG3 Vs. CD49b") +
  scale_colour_manual(values = c("grey30", "red"))
Volcano Plot for LAG3 Vs. CD49b. The 10 most highly-ranked genes are labelled, with the blue line representing a spline through the data.

Volcano Plot for LAG3 Vs. CD49b. The 10 most highly-ranked genes are labelled, with the blue line representing a spline through the data.

LAG3 Vs. DP

top_tables$LAG3vDP %>% 
  ggplot(aes(logFC, -log10(P.Value))) +
  geom_point(aes(colour = DE), alpha = 0.6) +
  geom_label_repel(
    aes(label = gene_name, colour = DE),
    data = . %>% 
      dplyr::filter(DE) %>% 
      arrange(P.Value) %>% 
      dplyr::slice(1:10),
    max.overlaps = Inf,
    show.legend = FALSE
  ) +
  ggtitle("LAG3 Vs. DP") +
  scale_colour_manual(values = c("grey30", "red"))
Volcano Plot for LAG3 Vs. DP The 10 most highly-ranked genes are labelled, with the blue line representing a spline through the data.

Volcano Plot for LAG3 Vs. DP The 10 most highly-ranked genes are labelled, with the blue line representing a spline through the data.

CD49b Vs. DP

top_tables$CD49bvDP %>% 
  ggplot(aes(logFC, -log10(P.Value))) +
  geom_point(aes(colour = DE), alpha = 0.6) +
  geom_label_repel(
    aes(label = gene_name, colour = DE),
    data = . %>% 
      dplyr::filter(DE) %>% 
      arrange(P.Value) %>% 
      dplyr::slice(1:10),
    max.overlaps = Inf,
    show.legend = FALSE
  ) +
  ggtitle("CD49b Vs. DP") +
  scale_colour_manual(values = c("grey30", "red"))
Volcano Plot for CD49b Vs. DP. The 10 most highly-ranked genes are labelled, with the blue line representing a spline through the data.

Volcano Plot for CD49b Vs. DP. The 10 most highly-ranked genes are labelled, with the blue line representing a spline through the data.